//===- config_ag_gemm_kernel_sm80_A100_tp8_nnodes1.cu ------------------ C++ ---===//
//
// Copyright 2025 ByteDance Ltd. and/or its affiliates. All rights reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//===----------------------------------------------------------------------===//

// clang-format off
#include "flux/op_registry.h"
namespace bytedance::flux {
using namespace cute;

static int config_ag_gemm_kernel_sm80_a100_tp8_nnodes1 = []() {
  auto &inst = TuningConfigRegistry::instance();
  // PCIe: the entries below are presumably tuned for PCIe-connected A100s (TODO confirm)
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,1280,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,1280,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,1280,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,1280,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,3072,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,3072,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,3072,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,3072,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,1024,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,1024,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,1024,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,1024,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,7168,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,7168,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,256l,64l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,7168,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,7168,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,3584,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,3584,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,3584,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,3584,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(64,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,32l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(64,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(64,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(64,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(64,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(64,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(64,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(64,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(64,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(64,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(64,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(64,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(64,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(64,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(64,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(64,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(256,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,256l,32l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(256,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,256l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(256,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,256l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(256,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,256l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(256,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,256l,64l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(256,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(256,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(256,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(256,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(256,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(256,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(256,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(256,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(256,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(256,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(256,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(512,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(512,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(512,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(512,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(512,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(512,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(512,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(512,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(512,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(512,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(512,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(512,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(512,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(512,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(512,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(512,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(1024,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(1024,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(1024,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(1024,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(1024,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(1024,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(1024,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(1024,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(1024,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(1024,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(1024,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(1024,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(1024,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(1024,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(1024,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(1024,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(2048,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(2048,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(2048,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(2048,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(2048,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(2048,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(2048,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(2048,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(2048,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(2048,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(2048,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(2048,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(2048,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(2048,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(2048,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(2048,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(8192,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(8192,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(8192,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(8192,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(8192,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(8192,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(8192,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(8192,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(8192,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(8192,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(8192,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(8192,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(8192,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_BF16{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(8192,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(8192,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_FP16{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(8192,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
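  // NVLink
  // Entries from here on belong to the NVLink group; the entries above this marker sit under the
  // `// PCIE` marker at the top of this initializer.
  // NOTE(review): the registration key (gemm meta + runtime config) carries no interconnect field,
  // so the (4096,2048,12288) and (4096,6144,12288) RRR/_Void entries in this group repeat keys
  // already registered in the PCIE group, in two cases with different hparams. How
  // TuningConfigRegistry::add resolves repeated keys is not visible here -- confirm which entry wins.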
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(64,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(64,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(64,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(64,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  // inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(64,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  // inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(64,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  // inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(64,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  // inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(64,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(256,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(256,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(256,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(256,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(512,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(512,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(512,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(512,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  // inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(256,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  // inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(256,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  // inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(256,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  // inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(256,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  // inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(512,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  // inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(512,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  // inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(512,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  // inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(512,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(8192,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(8192,2048,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(8192,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(8192,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(2048,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,32l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(2048,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,32l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(1024,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_FP16{}(),_FP16{}(),_Void{}(),_FP16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(1024,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
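  // Train: BF16-only entries, registered as RRR/RCR pairs per shape; every
  // make_runtime_config in this group has 4096 as its first argument.
  // NOTE(review): the RRR (4096,6144,12288) entry in this group is identical to the
  // BF16 RRR entry for the same shape registered just above this comment, and the
  // RRR/RCR (4096,1280,8192) entries reuse the meta and runtime config of entries in
  // the PCIE group at the top of this function with different hparams. Which entry
  // takes effect depends on TuningConfigRegistry::add — confirm the repeats are intended.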
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,1024,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,1024,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,1024,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,1024,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,1024,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,1024,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,1280,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,1280,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,1280,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,1280,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,1280,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,1280,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,1536,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,1536,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,1536,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,1536,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,1536,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,1536,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,3072,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,3072,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,3072,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,32l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,3072,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,3072,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,3072,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,3584,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,3584,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,3584,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,32l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,3584,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,3584,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,3584,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,4608,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,4608,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,4608,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,4608,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,4608,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,4608,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,6144,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,6144,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,6144,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,6144,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,7168,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,7168,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,7168,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,7168,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(4096,7168,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(4096,7168,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  // Inference
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(16384,1024,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(16384,1024,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(16384,1024,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,32l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(16384,1024,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(16384,1024,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(16384,1024,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(16384,1280,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(16384,1280,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(16384,1280,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(16384,1280,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(16384,1280,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(16384,1280,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(16384,1536,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(16384,1536,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(16384,1536,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(16384,1536,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(16384,1536,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(16384,1536,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(16384,3072,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(16384,3072,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(16384,3072,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(16384,3072,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(16384,3072,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(16384,3072,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(16384,3584,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(16384,3584,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(16384,3584,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(16384,3584,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(16384,3584,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(16384,3584,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(16384,4608,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(16384,4608,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(16384,4608,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(16384,4608,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(16384,4608,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(16384,4608,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(16384,6144,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(16384,6144,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(16384,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(16384,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(16384,6144,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(16384,6144,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(16384,7168,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(16384,7168,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(16384,7168,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(16384,7168,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(16384,7168,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,256l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(16384,7168,49152,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(128l,128l,64l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  // Small-M inference shapes: the first make_runtime_config argument is 64 or 512.
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(64,3072,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(64,3072,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(64,3072,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,32l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(64,3072,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,256l,32l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(64,4608,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(64,4608,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,256l,32l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(64,4608,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,32l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(64,4608,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(64,6144,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(64,6144,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(64,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,256l,32l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(64,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(64,7168,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(64,7168,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),3,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(64,7168,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(64,7168,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(64l,128l,64l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(512,3072,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(512,3072,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(512,3072,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(512,3072,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),3,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(512,4608,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(512,4608,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(512,4608,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(512,4608,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(128l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(512,6144,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(512,6144,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(512,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(512,6144,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkDP{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(512,7168,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(512,7168,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RRR{}(),_GemmV2{}()),make_runtime_config(512,7168,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_BF16{}(),_BF16{}(),_Void{}(),_BF16{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}()),make_runtime_config(512,7168,12288,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,64l,32l),cute::make_tuple(16l,8l,16l),_StreamkSK{}()),None{},cute::make_tuple(256l,128l,32l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  // EP serving (entries below use _S8 dtype configs).
  inst.add(make_gemm_meta(make_gemm_dtype_config(_S8{}(),_S8{}(),_BF16{}(),_BF16{}(),_S32{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}(),make_gemm_v2_meta(false),None{}),make_runtime_config(512,1280,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,32l,128l),cute::make_tuple(16l,8l,32l),_StreamkSK{}()),None{},cute::make_tuple(128l,64l,128l),_GemmStreamK{}(),4,_RasterHeuristic{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_S8{}(),_S8{}(),_Void{}(),_BF16{}(),_S32{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}(),make_gemm_v2_meta(false),None{}),make_runtime_config(512,1280,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,32l,128l),cute::make_tuple(16l,8l,32l),_StreamkSK{}()),None{},cute::make_tuple(128l,64l,128l),_GemmStreamK{}(),4,_RasterHeuristic{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_S8{}(),_S8{}(),_BF16{}(),_BF16{}(),_S32{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}(),make_gemm_v2_meta(false),None{}),make_runtime_config(256,1280,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,32l,128l),cute::make_tuple(16l,8l,32l),_StreamkSK{}()),None{},cute::make_tuple(128l,64l,128l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_S8{}(),_S8{}(),_Void{}(),_BF16{}(),_S32{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}(),make_gemm_v2_meta(false),None{}),make_runtime_config(256,1280,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,32l,128l),cute::make_tuple(16l,8l,32l),_StreamkSK{}()),None{},cute::make_tuple(128l,64l,128l),_GemmStreamK{}(),4,_RasterAlongN{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_S8{}(),_S8{}(),_BF16{}(),_BF16{}(),_S32{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}(),make_gemm_v2_meta(false),None{}),make_runtime_config(128,1280,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,32l,128l),cute::make_tuple(16l,8l,32l),_StreamkSK{}()),None{},cute::make_tuple(128l,64l,128l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_S8{}(),_S8{}(),_Void{}(),_BF16{}(),_S32{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}(),make_gemm_v2_meta(false),None{}),make_runtime_config(128,1280,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,32l,128l),cute::make_tuple(16l,8l,32l),_StreamkSK{}()),None{},cute::make_tuple(128l,64l,128l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_S8{}(),_S8{}(),_BF16{}(),_BF16{}(),_S32{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}(),make_gemm_v2_meta(false),None{}),make_runtime_config(64,1280,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,32l,128l),cute::make_tuple(16l,8l,32l),_StreamkSK{}()),None{},cute::make_tuple(128l,64l,128l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  inst.add(make_gemm_meta(make_gemm_dtype_config(_S8{}(),_S8{}(),_Void{}(),_BF16{}(),_S32{}()),_Sm80{}(),_A100{}(),_AGKernel{}(),_RCR{}(),_GemmV2{}(),make_gemm_v2_meta(false),None{}),make_runtime_config(64,1280,8192,make_all_gather_runtime_config(8,1,0)),make_gemm_hparams(make_gemm_v2_hparams(cute::make_tuple(64l,32l,128l),cute::make_tuple(16l,8l,32l),_StreamkSK{}()),None{},cute::make_tuple(128l,64l,128l),_GemmStreamK{}(),4,_RasterAlongM{}()));
  return 0;
}();
}
// clang-format on
